Floormod
逐元素计算两个输入张量的 floor-modulus。
\[\text{output}_i = \text{input0}_i - \lfloor \frac{\text{input0}_i}{\text{input1}_i} \rfloor \cdot \text{input1}_i\]
其中 \(\lfloor \cdot \rfloor\) 表示向下取整 (floor) 操作。
- 输入:
input0 - 第一个输入张量(被除数)的数据地址。
input1 - 第二个输入张量(除数)的数据地址。
- params - 参数打包(long long 数组),下列各项按顺序依次存放于 params[0]–params[6]:
input0_dims - input0的维度信息。
input1_dims - input1的维度信息。
output_dims - output的维度信息。
strides0 - 输入张量0的步长信息。
strides1 - 输入张量1的步长信息。
strides_output - 输出张量的步长信息。
num_dims - 张量的维度数。
core_mask - 核掩码(仅共享存储版本使用;作为函数的独立参数传入,不存放在 params 中)。
- 输出:
output - 输出张量的数据地址,其形状由 output_dims 指定(两个输入形状相同时,与输入张量大小相同)。
- 支持平台:
FT78NE、MT7004
备注
FT78NE 支持fp32
MT7004 支持fp16, fp32
共享存储版本:
-
void fp_floor_mod_s(float *input0, float *input1, float *output, long long *params, int core_mask)
-
void hp_floor_mod_s(half *input0, half *input1, half *output, long long *params, int core_mask)
-
void dp_floor_mod_s(double *input0, double *input1, double *output, long long *params, int core_mask)
C调用示例:
1//FT78NE示例
2#include <stdio.h>
3#include <floormod.h>
4int main(int argc, char* argv[]) {
5 float* input0 = (float*)0x81000000;
6 float* input1 = (float*)0x82000000;
7 float* output = (float*)0x83000000;
8 int *strides0 = (int*)0x84000000;
9 int *strides1 = (int*)0x85000000;
10 int *strides_output = (int*)0x86000000;
11
12 int core_mask = 0b1111;
13
14 // same shape
15 int input0_dims[] = {4, 1, 16}; // 2x2
16 int input1_dims[] = {4, 8, 16}; // 2x2
17 int output_dims[] = {4, 8, 16}; // 2x2
18 int num_dims = 3;
19
20 unsigned long long params[9];
21 params[0] = (unsigned long long)input0_dims;
22 params[1] = (unsigned long long)input1_dims;
23 params[2] = (unsigned long long)output_dims;
24 params[3] = (unsigned long long)strides0;
25 params[4] = (unsigned long long)strides1;
26 params[5] = (unsigned long long)strides_output;
27 params[6] = (unsigned long long)num_dims;
28
29 int total_input0 = get_total_elements(num_dims, input0_dims);
30 int total_input1 = get_total_elements(num_dims, input1_dims);
31 int total_output = get_total_elements(num_dims, output_dims);
32
33 srand(time(0));
34
35 int i;
36 for (i = 0; i < total_input0; ++i) {
37 input0[i] = (float)(rand() % 100) / 10.0f;
38 }
39
40 for (i = 0; i < total_input1; ++i) {
41 input1[i] = (float)(rand() % 100) / 10.0f + 0.01f;
42 }
43
44 fp_floor_mod_s(input0, input1, output, params, core_mask);
45 return 0;
46}
私有存储版本:
-
void fp_floor_mod_p(float *input0, float *input1, float *output, long long *params)
-
void hp_floor_mod_p(half *input0, half *input1, half *output, long long *params)
-
void dp_floor_mod_p(double *input0, double *input1, double *output, long long *params)
C调用示例:
1//FT78NE示例
2#include <stdio.h>
3#include <floormod.h>
4int main(int argc, char* argv[]) {
5 float* input0 = (float*)0x10010000;
6 float* input1 = (float*)0x10020000;
7 float* output = (float*)0x10030000;
8 int *strides0 = (int*)0x10050000;
9 int *strides1 = (int*)0x10053000;
10 int *strides_output = (int*)0x10056000;
11
12 // same shape
13 int input0_dims[] = {4, 1, 16}; // 2x2
14 int input1_dims[] = {4, 8, 16}; // 2x2
15 int output_dims[] = {4, 8, 16}; // 2x2
16 int num_dims = 3;
17
18 unsigned long long params[9];
19 params[0] = (unsigned long long)input0_dims;
20 params[1] = (unsigned long long)input1_dims;
21 params[2] = (unsigned long long)output_dims;
22 params[3] = (unsigned long long)strides0;
23 params[4] = (unsigned long long)strides1;
24 params[5] = (unsigned long long)strides_output;
25 params[6] = (unsigned long long)num_dims;
26
27 int total_input0 = get_total_elements(num_dims, input0_dims);
28 int total_input1 = get_total_elements(num_dims, input1_dims);
29 int total_output = get_total_elements(num_dims, output_dims);
30
31 srand(time(0));
32
33 int i;
34 for (i = 0; i < total_input0; ++i) {
35 input0[i] = (float)(rand() % 100) / 10.0f;
36 }
37
38 for (i = 0; i < total_input1; ++i) {
39 input1[i] = (float)(rand() % 100) / 10.0f + 0.01f;
40 }
41
42 fp_floor_mod_p(input0, input1, output, params);
43 return 0;
44}